--! This file is part of the FELIX firmware distribution (https://gitlab.cern.ch/atlas-tdaq-felix/firmware/).
--! Copyright (C) 2001-2021 CERN for the benefit of the ATLAS collaboration.
--! Authors:
--!               Israel Grayzman
--!               RHabraken
--!               Thei Wijnen
--!               Frans Schreuder
--!
--!   Licensed under the Apache License, Version 2.0 (the "License");
--!   you may not use this file except in compliance with the License.
--!   You may obtain a copy of the License at
--!
--!       http://www.apache.org/licenses/LICENSE-2.0
--!
--!   Unless required by applicable law or agreed to in writing, software
--!   distributed under the License is distributed on an "AS IS" BASIS,
--!   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--!   See the License for the specific language governing permissions and
--!   limitations under the License.


--!------------------------------------------------------------------------------
--!
--!           NIKHEF - National Institute for Subatomic Physics
--!
--!                       Electronics Department
--!
--!-----------------------------------------------------------------------------
--! @class dma_read_write
--!
--!
--! @author      Andrea Borga    (andrea.borga@nikhef.nl)<br>
--!              Frans Schreuder (frans.schreuder@nikhef.nl)
--!
--!
--! @date        07/01/2015    created
--!
--! @version     1.0
--!
--! @brief
--! dma_read_write contains the actual DMA state machines. It processes the descriptors,
--! and reads from and writes to the PC memory whenever there is data in the fifo.
--!
--!
--! @detail
--!
--!-----------------------------------------------------------------------------
--! @TODO
--!
--!
--! ------------------------------------------------------------------------------
--! Wupper: PCIe Gen3 and Gen4 DMA Core for Xilinx FPGAs
--!
--! Copyright (C) 2021 Nikhef, Amsterdam (f.schreuder@nikhef.nl)
--!
--! Licensed under the Apache License, Version 2.0 (the "License");
--! you may not use this file except in compliance with the License.
--! You may obtain a copy of the License at
--!
--!         http://www.apache.org/licenses/LICENSE-2.0
--!
--! Unless required by applicable law or agreed to in writing, software
--! distributed under the License is distributed on an "AS IS" BASIS,
--! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--! See the License for the specific language governing permissions and
--! limitations under the License.
--
--! @brief ieee



library ieee, UNISIM;
    use ieee.numeric_std.all;
    use UNISIM.VCOMPONENTS.all;
    use ieee.numeric_std_unsigned.all;
    use ieee.std_logic_1164.all;
    use work.pcie_package.all;
Library xpm;
    use xpm.vcomponents.all;

--! Entity declaration of the DMA read/write engine.
--! The descriptor and status arrays are indexed 0 to NUMBER_OF_DESCRIPTORS
--! (NUMBER_OF_DESCRIPTORS+1 entries); the ToHost FIFO ports are indexed
--! 0 to NUMBER_OF_DESCRIPTORS-2.
entity dma_read_write is
    generic(
        NUMBER_OF_DESCRIPTORS : integer := 2;   --! Sizes the descriptor/status arrays and the ToHost FIFO port ranges (see above).
        DATA_WIDTH            : integer := 512  --! FIFO data width in bits; the architecture has generate branches for 256, 512 and 1024.
    );
    port (
        clk                     : in     std_logic; --! Clock; the thresh and add_header processes act on its rising edge.
        dma_descriptors         : in     dma_descriptors_type(0 to (NUMBER_OF_DESCRIPTORS)); --! Descriptors; the visible logic reads .enable, .read_not_write and .dword_count.
        dma_soft_reset          : in     std_logic; --! Synchronous reset of add_header, OR-ed with reset.
        dma_status              : out    dma_statuses_type(0 to (NUMBER_OF_DESCRIPTORS)); --! Per-descriptor status (presumably driven further down in the architecture; verify).
        fromHostFifoIndex       : out    integer range 0 to 0; --! Tied to 0 by the architecture.
        fromHostFifo_din        : out    std_logic_vector(DATA_WIDTH-1 downto 0);
        fromHostFifo_prog_full  : in     std_logic; --! Must be '0' before add_header selects a read (FromHost) descriptor.
        fromHostFifo_full       : in     std_logic;
        fromHostFifo_we         : out    std_logic;
        m_axis_r_rq             : in     axis_r_type; --! Return channel of the RQ stream; .tready gates the FIFO read enable and the add_header pipeline.
        m_axis_rq               : out    axis_type; --! RQ stream, copied from the internal s_m_axis_rq signal.
        reset                   : in     std_logic; --! Synchronous reset of add_header, OR-ed with dma_soft_reset.
        s_axis_r_rc             : out    axis_r_type;
        s_axis_rc               : in     axis_type; --! RC stream; its tuser bits are unpacked into axis_rc_tuser_bits.
        toHostFifoIndex         : out    integer range 0 to NUMBER_OF_DESCRIPTORS-2; --! Copy of the internal toHostFifoIndex_s.
        toHostFifo_dout         : in     std_logic_vector(DATA_WIDTH-1 downto 0); --! ToHost FIFO read data, registered into rq_data.
        toHostFifo_empty_thresh : out    slv12_array(0 to NUMBER_OF_DESCRIPTORS-2); --! Per-FIFO threshold computed from dword_count by the thresh process.
        toHostFifo_prog_empty   : in     std_logic_vector(NUMBER_OF_DESCRIPTORS-2 downto 0); --! Per-FIFO programmable-empty flags; '0' is required before a ToHost read is started.
        toHostFifo_re           : out    std_logic); --! ToHost FIFO read enable, generated by re_proc.
end entity dma_read_write;



architecture rtl of dma_read_write is
    -- Number of ToHost descriptors: indices 0 to NUMBER_OF_DESCRIPTORS-2 of dma_descriptors.
    constant NUMBER_OF_DESCRIPTORS_TOHOST: integer := NUMBER_OF_DESCRIPTORS -1;
    --constant NUMBER_OF_DESCRIPTORS_FROMHOST: integer := 1;

    -- State of the add_header state machine; reset value and power-up value are both IDLE.
    type rw_state_type is(IDLE, CONT_WRITE, DELAY);
    signal rw_state: rw_state_type := IDLE;

    --type strip_state_type is(IDLE, PUSH_DATA);
    --signal strip_state: strip_state_type := IDLE;
    --signal toHostFifo_dout_pipe: std_logic_vector(127 downto 0); --pipe part of the fifo data 1 clock cycle for 256 bit alignment
    signal mem_dina_pipe: std_logic_vector(DATA_WIDTH-97 downto 0);  --pipe part of the fifo data 1 clock cycle for 256 bit alignment
    signal mem_dina_pipe_valid: std_logic;
    -- Traffic class and attribute fields placed in the request header built by add_header.
    constant req_tc: std_logic_vector (2 downto 0) := "000";
    constant req_attr: std_logic_vector(2 downto 0) := "000"; --ID based ordering, Relaxed ordering, No Snoop (should be "001"?)
    --signal receive_word_count: std_logic_vector(10 downto 0);
    -- Index of the descriptor currently being serviced (0 to NUMBER_OF_DESCRIPTORS).
    signal active_descriptor_s: integer range 0 to (NUMBER_OF_DESCRIPTORS);
    -- Internal copy of the toHostFifoIndex output port.
    signal toHostFifoIndex_s    :   integer range 0 to NUMBER_OF_DESCRIPTORS_TOHOST-1;
    -- Internal copy of the toHostFifo_re output port, generated by re_proc.
    signal toHostFifo_re_s : std_logic;

    -- Internal copy of the m_axis_rq output port.
    signal s_m_axis_rq : axis_type;
    -- Per-descriptor flag vectors, one bit for each descriptor index 0 to NUMBER_OF_DESCRIPTORS.
    signal evencycle_dma_s: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
    signal dma_wait_s: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0); -- '1' blocks the descriptor from being started or selected
    signal dma_wait_pc_pointer_s: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
    signal dma_wait_next_s: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
    -- One 64-bit address per descriptor.
    type slv64_arr is array(0 to (NUMBER_OF_DESCRIPTORS)) of std_logic_vector(63 downto 0);
    signal next_address_s           : slv64_arr;
    signal current_address_s           : slv64_arr; -- bits 63:2 are placed in the request header by add_header
    signal address_wrapped_s: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
    signal ToHostWriteCount_s, ToHostWriteCount_p1_s: std_logic_vector(7 downto 0); --Supports max 128x256 or for 512b, count 2 per cycle to account for straddling, max 4096byte TLP
    signal rc_tready_s : std_logic;
    --signal current_address: std_logic_vector(63 downto 0);

    -- Buffer memory interface signals. The address width is 15-f_log2(DATA_WIDTH) bits and
    -- mem_full has 32768/DATA_WIDTH bits (presumably one flag per memory word; f_log2 is
    -- assumed to be a base-2 logarithm from pcie_package -- TODO confirm).
    signal mem_doutb : std_logic_vector(DATA_WIDTH-1 downto 0);
    signal mem_addra : std_logic_vector(14-f_log2(DATA_WIDTH) downto 0);
    signal mem_addra_next : std_logic_vector(14-f_log2(DATA_WIDTH) downto 0);
    signal mem_addra_next_valid : std_logic;
    signal mem_addrb : std_logic_vector(14-f_log2(DATA_WIDTH) downto 0);
    signal mem_dina  : std_logic_vector(DATA_WIDTH-1 downto 0);
    signal mem_wea   : std_logic_vector(0 downto 0);
    signal mem_full  : std_logic_vector((32768/DATA_WIDTH)-1 downto 0);
    signal mem_full_p1  : std_logic_vector((32768/DATA_WIDTH)-1 downto 0);
    signal reading_mem : std_logic;
    --signal fromHostFifo_we_p0 : std_logic;
    signal clear_wait_for_4k_boundary : std_logic;

    -- Position in RoundRobinLookup; add_header steps it through 0 to (NUMBER_OF_DESCRIPTORS-1)*2-1 while idle.
    signal StartSearchingAt: integer range 0 to (NUMBER_OF_DESCRIPTORS-1)*2-1;
    -- Requests a FIFO read in the non-IDLE states (AND-ed with tready in re_proc).
    signal do_re_fifo: std_logic;
    -- toHostFifo_prog_empty extended to one bit per descriptor index.
    signal toHostFifo_prog_empty_s: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
    signal next_address_equals_end_address: std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
    type IntArray_type is array(natural range <>) of integer;
    --! Alternate between one of the ToHost descriptors (round robin) and, every other turn, the FromHost descriptor.
    --! Even entries hold ToHost indices 0 to 7, odd entries hold NUMBER_OF_DESCRIPTORS-1.
    --! NOTE(review): the table has 16 entries, so the range of StartSearchingAt only fits
    --! while NUMBER_OF_DESCRIPTORS-1 <= 8.
    constant RoundRobinLookup : IntArray_type(0 to 15) := (0,NUMBER_OF_DESCRIPTORS-1,
                                                            1,NUMBER_OF_DESCRIPTORS-1,
                                                            2,NUMBER_OF_DESCRIPTORS-1,
                                                            3,NUMBER_OF_DESCRIPTORS-1,
                                                            4,NUMBER_OF_DESCRIPTORS-1,
                                                            5,NUMBER_OF_DESCRIPTORS-1,
                                                            6,NUMBER_OF_DESCRIPTORS-1,
                                                            7,NUMBER_OF_DESCRIPTORS-1);

    -- Width-independent view of the RQ tuser sideband. Every field is sized for the widest
    -- (1024b) layout; the g_tuserbits_* generate blocks copy the relevant low bits into
    -- s_m_axis_rq.tuser. The ranges in the comments are the tuser positions used by those blocks.
    type axis_rq_tuser_bits_type is record
        first_be: std_logic_vector(15 downto 0); --tuser 3:0 for 256b, 7:0 for 512b, 15:0 for 1024b
        last_be:  std_logic_vector(15 downto 0); --tuser 7:4 for 256b, 15:8 for 512b, 31:16 for 1024b
        addr_offset: std_logic_vector(15 downto 0); --tuser 10:8 for 256b, 19:16 for 512b, 47:32 for 1024b; unused, set to zero
        is_sop: std_logic_vector(3 downto 0); --tuser 21:20 for 512b, 51:48 for 1024b
        is_sop0_ptr: std_logic_vector(1 downto 0); --tuser 23:22 for 512b, 53:52 for 1024b
        is_sop1_ptr: std_logic_vector(1 downto 0); --tuser 25:24 for 512b, 55:54 for 1024b
        is_sop2_ptr: std_logic_vector(1 downto 0); --Only for 1024b, tuser 57:56
        is_sop3_ptr: std_logic_vector(1 downto 0); --Only for 1024b, tuser 59:58
        is_eop: std_logic_vector(3 downto 0); --tuser 27:26 for 512b, 63:60 for 1024b
        is_eop0_ptr: std_logic_vector(4 downto 0); --tuser 31:28 for 512b (low 4 bits), 68:64 for 1024b
        is_eop1_ptr: std_logic_vector(4 downto 0); --tuser 35:32 for 512b (low 4 bits), 73:69 for 1024b
        is_eop2_ptr: std_logic_vector(4 downto 0); --Only for 1024b, tuser 78:74
        is_eop3_ptr: std_logic_vector(4 downto 0); --Only for 1024b, tuser 83:79
        discontinue: std_logic; --tuser 11 for 256b, 36 for 512b, 84 for 1024b; unused, set to '0'
        tph_present: std_logic_vector(1 downto 0); --tuser 12 for 256b, 38:37 for 512b, not mapped for 1024b; set to "00"
        tph_type: std_logic_vector(3 downto 0);  --tuser 14:13 for 256b, 42:39 for 512b, not mapped for 1024b; set to "0000"
        tph_indirect_tag_en: std_logic_vector(1 downto 0); --tuser 15 for 256b, 44:43 for 512b, not mapped for 1024b; set to "00"
        tph_st_tag: std_logic_vector(15 downto 0); --tuser 23:16 for 256b, 60:45 for 512b, not mapped for 1024b; set to x"0000"
        seq_num0: std_logic_vector(5 downto 0); --tuser 61:60 and 27:24 for 256b, 66:61 for 512b, 354:349 for 1024b; set to "000000"
        seq_num1: std_logic_vector(5 downto 0); --tuser 72:67 for 512b, 360:355 for 1024b; set to "000000"
        seq_num2: std_logic_vector(5 downto 0); --Only for 1024b, tuser 366:361
        seq_num3: std_logic_vector(5 downto 0); --Only for 1024b, tuser 372:367
        parity: std_logic_vector(127 downto 0); --tuser 59:28 for 256b, 136:73 for 512b, 256:129 for 1024b; set to all zeros
    end record;

    -- Single instance of the record; the constant fields are driven concurrently, the
    -- is_sop*/is_eop* fields are driven by the add_header process.
    signal axis_rq_tuser_bits: axis_rq_tuser_bits_type;

    -- Width-independent view of the RC tuser sideband. The g_tuserbits_* generate blocks
    -- unpack s_axis_rc.tuser into these fields, zero-extending where the bus layout is
    -- narrower than the field. The ranges in the comments are the tuser positions read by those blocks.
    type axis_rc_tuser_bits_type is record
        byte_en: std_logic_vector(127 downto 0); --tuser 31:0 for 256b, 63:0 for 512b, 127:0 for 1024b
        is_sop:  std_logic_vector(7 downto 0);  --tuser 33:32 for 256b, 67:64 for 512b, 135:128 for 1024b
        is_sop0_ptr: std_logic_vector(2 downto 0); --tuser 69:68 for 512b, 138:136 for 1024b; for 256b derived from tuser 34
        is_sop1_ptr: std_logic_vector(2 downto 0); --tuser 71:70 for 512b, 141:139 for 1024b; constant "001" for 256b
        is_sop2_ptr: std_logic_vector(2 downto 0); --tuser 73:72 for 512b, 144:142 for 1024b
        is_sop3_ptr: std_logic_vector(2 downto 0); --tuser 75:74 for 512b, 147:145 for 1024b
        is_sop4_ptr: std_logic_vector(2 downto 0); --Only for 1024 bit / gen5
        is_sop5_ptr: std_logic_vector(2 downto 0); --Only for 1024 bit / gen5
        is_sop6_ptr: std_logic_vector(2 downto 0); --Only for 1024 bit / gen5
        is_sop7_ptr: std_logic_vector(2 downto 0); --Only for 1024 bit / gen5
        is_eop: std_logic_vector(7 downto 0); --tuser 38 and 34 for 256b, 79:76 for 512b, 167:160 for 1024b
        is_eop0_ptr: std_logic_vector(4 downto 0); --tuser 83:80 for 512b, 172:168 for 1024b; tuser 37:35 for 256b
        is_eop1_ptr: std_logic_vector(4 downto 0); --tuser 87:84 for 512b, 177:173 for 1024b; tuser 41:39 for 256b
        is_eop2_ptr: std_logic_vector(4 downto 0); --tuser 91:88 for 512b, 182:178 for 1024b, NA for 256b
        is_eop3_ptr: std_logic_vector(4 downto 0); --tuser 95:92 for 512b, 187:183 for 1024b, NA for 256b
        is_eop4_ptr: std_logic_vector(4 downto 0); --Only for 1024 bit / gen5
        is_eop5_ptr: std_logic_vector(4 downto 0); --Only for 1024 bit / gen5
        is_eop6_ptr: std_logic_vector(4 downto 0); --Only for 1024 bit / gen5
        is_eop7_ptr: std_logic_vector(4 downto 0); --Only for 1024 bit / gen5
        discontinue: std_logic; --tuser 42 for 256b, 96 for 512b, 208 for 1024b
        parity: std_logic_vector(127 downto 0); --tuser 74:43 for 256b, 160:97 for 512b, 336:209 for 1024b
    end record;

    -- Single instance of the record, driven by whichever g_tuserbits_* block is active.
    signal axis_rc_tuser_bits: axis_rc_tuser_bits_type;

    signal rq_header : std_logic_vector(127 downto 0); --Contains the PCIe TLP header which is transmitted before the payload on the RQ AXIs bus.
    -- ToHost FIFO data and its one-cycle delayed copy; both are updated by add_header while tready is '1'.
    signal rq_data, rq_data_p1: std_logic_vector(DATA_WIDTH-1 downto 0);
    -- rq_data_valid follows toHostFifo_re_s by one clock, rq_data_valid_p1 follows rq_data_valid;
    -- rq_header_valid is set when add_header starts a write transaction.
    signal rq_data_valid, rq_data_valid_p1, rq_header_valid: std_logic;
    -- Cleared by add_header in the IDLE state (its set condition is outside this part of the file).
    signal add_delay_cycle: std_logic;
    --signal rq_straddling: std_logic; --for 512b interface only: 0: TLP starts at bit 0, 1: TLP starts at bit 256
    -- When 1, add_header redirects the FromHost round-robin slot to descriptor index NUMBER_OF_DESCRIPTORS.
    signal nextFromHostIndex: integer range 0 to 1;

begin

    -- Output port hookups: the ports mirror their internal copies.
    m_axis_rq         <= s_m_axis_rq;
    toHostFifoIndex   <= toHostFifoIndex_s;
    fromHostFifoIndex <= 0; -- only one FromHost FIFO is addressed for now

    -- Byte enables: only complete DWORDs are sent to DMA, so every byte is enabled.
    axis_rq_tuser_bits.first_be <= (others => '1');
    axis_rq_tuser_bits.last_be  <= (others => '1');

    -- RQ sideband fields that this core never uses are tied to zero.
    axis_rq_tuser_bits.addr_offset         <= (others => '0');
    axis_rq_tuser_bits.discontinue         <= '0';
    axis_rq_tuser_bits.parity              <= (others => '0');
    -- TLP processing hints are disabled.
    axis_rq_tuser_bits.tph_present         <= (others => '0');
    axis_rq_tuser_bits.tph_type            <= (others => '0');
    axis_rq_tuser_bits.tph_indirect_tag_en <= (others => '0');
    axis_rq_tuser_bits.tph_st_tag          <= (others => '0');
    -- Sequence numbers are not tracked.
    axis_rq_tuser_bits.seq_num0            <= (others => '0');
    axis_rq_tuser_bits.seq_num1            <= (others => '0');
    axis_rq_tuser_bits.seq_num2            <= (others => '0');
    axis_rq_tuser_bits.seq_num3            <= (others => '0');

    -- 256-bit interface: pack the RQ sideband record into s_m_axis_rq.tuser and unpack
    -- s_axis_rc.tuser into the RC sideband record. Record fields wider than the 256b
    -- layout only contribute their low bits; RC fields that do not exist for 256b are
    -- tied to zero.
    g_tuserbits_256: if DATA_WIDTH = 256 generate
        s_m_axis_rq.tuser(3 downto 0)    <= axis_rq_tuser_bits.first_be(3 downto 0);    --first_be
        s_m_axis_rq.tuser(7 downto 4)    <= axis_rq_tuser_bits.last_be(3 downto 0);     --last_be
        s_m_axis_rq.tuser(10 downto 8)   <= axis_rq_tuser_bits.addr_offset(2 downto 0); --addr_offset, unused, set to "000"
        s_m_axis_rq.tuser(11)            <= axis_rq_tuser_bits.discontinue;             --discontinue, unused, set to '0'
        s_m_axis_rq.tuser(12)            <= axis_rq_tuser_bits.tph_present(0);          --tph_present, set to '0'
        s_m_axis_rq.tuser(14 downto 13)  <= axis_rq_tuser_bits.tph_type(1 downto 0);    --tph_type, set to "00"
        s_m_axis_rq.tuser(15)            <= axis_rq_tuser_bits.tph_indirect_tag_en(0);  --tph_indirect_tag_en, set to '0'
        s_m_axis_rq.tuser(23 downto 16)  <= axis_rq_tuser_bits.tph_st_tag(7 downto 0);  --tph_st_tag, set to x"00"
        s_m_axis_rq.tuser(27 downto 24)  <= axis_rq_tuser_bits.seq_num0(3 downto 0);    --seq_num0, low 4 bits
        s_m_axis_rq.tuser(59 downto 28)  <= axis_rq_tuser_bits.parity(31 downto 0);     --parity, set to zero
        s_m_axis_rq.tuser(61 downto 60)  <= axis_rq_tuser_bits.seq_num0(5 downto 4);    --seq_num0, upper 2 bits
        -- The tie-off starts at bit 62: bit 61 is already driven by seq_num0(5) above, and a
        -- second concurrent driver on the same bit would create a multiply-driven net.
        s_m_axis_rq.tuser(182 downto 62) <= (others => '0');
        -- The upper half of the data bus is unused on a 256-bit interface.
        s_m_axis_rq.tdata(511 downto 256) <= (others => '0');

        axis_rc_tuser_bits.byte_en <= x"0000_0000_0000_0000_0000_0000" & s_axis_rc.tuser(31 downto 0);
        axis_rc_tuser_bits.is_sop <= "000000" & s_axis_rc.tuser(33 downto 32);
        axis_rc_tuser_bits.is_sop0_ptr <= "00"&s_axis_rc.tuser(34); -- At position 1 if is_eop is 1 (tuser 34), otherwise position 0
        axis_rc_tuser_bits.is_sop1_ptr <= "001"; --Always at bit position 128, byte lane 16
        axis_rc_tuser_bits.is_sop2_ptr <= "000"; --NA for 256b
        axis_rc_tuser_bits.is_sop3_ptr <= "000"; --NA for 256b
        axis_rc_tuser_bits.is_sop4_ptr <= "000"; --NA for 256b
        axis_rc_tuser_bits.is_sop5_ptr <= "000"; --NA for 256b
        axis_rc_tuser_bits.is_sop6_ptr <= "000"; --NA for 256b
        axis_rc_tuser_bits.is_sop7_ptr <= "000"; --NA for 256b
        -- tuser 34 and 38 are the lowest bits of the two 4-bit end-of-packet groups;
        -- the remaining three bits of each group are used as the end pointers below.
        axis_rc_tuser_bits.is_eop <= "000000" & s_axis_rc.tuser(38) & s_axis_rc.tuser(34);
        axis_rc_tuser_bits.is_eop0_ptr <= "00" & s_axis_rc.tuser(37 downto 35);
        axis_rc_tuser_bits.is_eop1_ptr <= "00" & s_axis_rc.tuser(41 downto 39);
        axis_rc_tuser_bits.is_eop2_ptr <= "00000"; --NA for 256b
        axis_rc_tuser_bits.is_eop3_ptr <= "00000"; --NA for 256b
        axis_rc_tuser_bits.is_eop4_ptr <= "00000"; --NA for 256b
        axis_rc_tuser_bits.is_eop5_ptr <= "00000"; --NA for 256b
        axis_rc_tuser_bits.is_eop6_ptr <= "00000"; --NA for 256b
        axis_rc_tuser_bits.is_eop7_ptr <= "00000"; --NA for 256b
        axis_rc_tuser_bits.discontinue <= s_axis_rc.tuser(42);
        axis_rc_tuser_bits.parity      <= x"0000_0000_0000_0000_0000_0000" & s_axis_rc.tuser(74 downto 43);  --parity, zero-extended to 128 bits

    end generate;
    -- 512-bit interface: pack the RQ sideband record into s_m_axis_rq.tuser and unpack
    -- s_axis_rc.tuser into the RC sideband record. Record fields wider than the 512b
    -- layout only contribute their low bits; RC fields that exist only in the 1024b
    -- layout are tied to zero so they never float at 'U'.
    g_tuserbits_512: if DATA_WIDTH = 512 generate
        s_m_axis_rq.tuser(7 downto 0)     <= axis_rq_tuser_bits.first_be(7 downto 0);     --first_be
        s_m_axis_rq.tuser(15 downto 8)    <= axis_rq_tuser_bits.last_be(7 downto 0);      --last_be
        s_m_axis_rq.tuser(19 downto 16)   <= axis_rq_tuser_bits.addr_offset(3 downto 0);  --addr_offset, unused, set to "0000"
        s_m_axis_rq.tuser(21 downto 20)   <= axis_rq_tuser_bits.is_sop(1 downto 0);       --is_sop
        s_m_axis_rq.tuser(23 downto 22)   <= axis_rq_tuser_bits.is_sop0_ptr;              --is_sop0_ptr
        s_m_axis_rq.tuser(25 downto 24)   <= axis_rq_tuser_bits.is_sop1_ptr;              --is_sop1_ptr
        s_m_axis_rq.tuser(27 downto 26)   <= axis_rq_tuser_bits.is_eop(1 downto 0);       --is_eop
        s_m_axis_rq.tuser(31 downto 28)   <= axis_rq_tuser_bits.is_eop0_ptr(3 downto 0);  --is_eop0_ptr
        s_m_axis_rq.tuser(35 downto 32)   <= axis_rq_tuser_bits.is_eop1_ptr(3 downto 0);  --is_eop1_ptr
        s_m_axis_rq.tuser(36)             <= axis_rq_tuser_bits.discontinue;              --discontinue, unused, set to '0'
        s_m_axis_rq.tuser(38 downto 37)   <= axis_rq_tuser_bits.tph_present;              --tph_present, set to "00"
        s_m_axis_rq.tuser(42 downto 39)   <= axis_rq_tuser_bits.tph_type;                 --tph_type, set to "0000"
        s_m_axis_rq.tuser(44 downto 43)   <= axis_rq_tuser_bits.tph_indirect_tag_en;      --tph_indirect_tag_en, set to "00"
        s_m_axis_rq.tuser(60 downto 45)   <= axis_rq_tuser_bits.tph_st_tag;               --tph_st_tag, set to x"0000"
        s_m_axis_rq.tuser(66 downto 61)   <= axis_rq_tuser_bits.seq_num0;                 --seq_num0, set to "000000"
        s_m_axis_rq.tuser(72 downto 67)   <= axis_rq_tuser_bits.seq_num1;                 --seq_num1, set to "000000"
        s_m_axis_rq.tuser(136 downto 73)  <= axis_rq_tuser_bits.parity(63 downto 0);      --parity, set to zero
        s_m_axis_rq.tuser(182 downto 137) <= (others => '0');                             --182 downto 137 for versal devices.

        axis_rc_tuser_bits.byte_en     <= x"0000_0000_0000_0000" & s_axis_rc.tuser(63 downto 0);
        axis_rc_tuser_bits.is_sop      <= x"0" & s_axis_rc.tuser(67 downto 64);
        axis_rc_tuser_bits.is_sop0_ptr <= "0" & s_axis_rc.tuser(69 downto 68);
        axis_rc_tuser_bits.is_sop1_ptr <= "0" & s_axis_rc.tuser(71 downto 70);
        axis_rc_tuser_bits.is_sop2_ptr <= "0" & s_axis_rc.tuser(73 downto 72);
        axis_rc_tuser_bits.is_sop3_ptr <= "0" & s_axis_rc.tuser(75 downto 74);
        axis_rc_tuser_bits.is_sop4_ptr <= "000"; --NA for 512b
        axis_rc_tuser_bits.is_sop5_ptr <= "000"; --NA for 512b
        axis_rc_tuser_bits.is_sop6_ptr <= "000"; --NA for 512b
        axis_rc_tuser_bits.is_sop7_ptr <= "000"; --NA for 512b
        axis_rc_tuser_bits.is_eop      <= x"0" & s_axis_rc.tuser(79 downto 76);
        axis_rc_tuser_bits.is_eop0_ptr <= "0" & s_axis_rc.tuser(83 downto 80);
        axis_rc_tuser_bits.is_eop1_ptr <= "0" & s_axis_rc.tuser(87 downto 84);
        axis_rc_tuser_bits.is_eop2_ptr <= "0" & s_axis_rc.tuser(91 downto 88);
        axis_rc_tuser_bits.is_eop3_ptr <= "0" & s_axis_rc.tuser(95 downto 92);
        axis_rc_tuser_bits.is_eop4_ptr <= "00000"; --NA for 512b
        axis_rc_tuser_bits.is_eop5_ptr <= "00000"; --NA for 512b
        axis_rc_tuser_bits.is_eop6_ptr <= "00000"; --NA for 512b
        axis_rc_tuser_bits.is_eop7_ptr <= "00000"; --NA for 512b
        axis_rc_tuser_bits.discontinue <= s_axis_rc.tuser(96);
        axis_rc_tuser_bits.parity      <= x"0000_0000_0000_0000" & s_axis_rc.tuser(160 downto 97);  --parity, zero-extended to 128 bits
    end generate;
    -- 1024-bit interface: every record field is used at its full width. The TPH fields
    -- of the RQ record (tph_present, tph_type, tph_indirect_tag_en, tph_st_tag) have no
    -- position in this mapping and are left unconnected.
    g_tuserbits_1024: if DATA_WIDTH = 1024 generate
        ---------------------------------------------------------------
        -- Requester request (RQ) sideband: record -> tuser
        ---------------------------------------------------------------
        -- Byte enables and address offset
        s_m_axis_rq.tuser(15 downto 0)    <= axis_rq_tuser_bits.first_be;
        s_m_axis_rq.tuser(31 downto 16)   <= axis_rq_tuser_bits.last_be;
        s_m_axis_rq.tuser(47 downto 32)   <= axis_rq_tuser_bits.addr_offset;

        -- Start-of-packet flags followed by their four position pointers
        s_m_axis_rq.tuser(51 downto 48)   <= axis_rq_tuser_bits.is_sop;
        s_m_axis_rq.tuser(53 downto 52)   <= axis_rq_tuser_bits.is_sop0_ptr;
        s_m_axis_rq.tuser(55 downto 54)   <= axis_rq_tuser_bits.is_sop1_ptr;
        s_m_axis_rq.tuser(57 downto 56)   <= axis_rq_tuser_bits.is_sop2_ptr;
        s_m_axis_rq.tuser(59 downto 58)   <= axis_rq_tuser_bits.is_sop3_ptr;

        -- End-of-packet flags followed by their four position pointers
        s_m_axis_rq.tuser(63 downto 60)   <= axis_rq_tuser_bits.is_eop;
        s_m_axis_rq.tuser(68 downto 64)   <= axis_rq_tuser_bits.is_eop0_ptr;
        s_m_axis_rq.tuser(73 downto 69)   <= axis_rq_tuser_bits.is_eop1_ptr;
        s_m_axis_rq.tuser(78 downto 74)   <= axis_rq_tuser_bits.is_eop2_ptr;
        s_m_axis_rq.tuser(83 downto 79)   <= axis_rq_tuser_bits.is_eop3_ptr;

        -- Discontinue flag and parity
        s_m_axis_rq.tuser(84)             <= axis_rq_tuser_bits.discontinue;
        s_m_axis_rq.tuser(256 downto 129) <= axis_rq_tuser_bits.parity;

        -- Four sequence numbers, six bits each
        s_m_axis_rq.tuser(354 downto 349) <= axis_rq_tuser_bits.seq_num0;
        s_m_axis_rq.tuser(360 downto 355) <= axis_rq_tuser_bits.seq_num1;
        s_m_axis_rq.tuser(366 downto 361) <= axis_rq_tuser_bits.seq_num2;
        s_m_axis_rq.tuser(372 downto 367) <= axis_rq_tuser_bits.seq_num3;

        -- Every remaining tuser bit is tied to zero
        s_m_axis_rq.tuser(128 downto 85)  <= (others => '0'); --reserved
        s_m_axis_rq.tuser(348 downto 257) <= (others => '0');
        s_m_axis_rq.tuser(464 downto 373) <= (others => '0');

        ---------------------------------------------------------------
        -- Requester completion (RC) sideband: tuser -> record
        ---------------------------------------------------------------
        axis_rc_tuser_bits.byte_en     <= s_axis_rc.tuser(127 downto 0);

        -- Start-of-packet flags and their eight 3-bit pointers
        axis_rc_tuser_bits.is_sop      <= s_axis_rc.tuser(135 downto 128);
        axis_rc_tuser_bits.is_sop0_ptr <= s_axis_rc.tuser(138 downto 136);
        axis_rc_tuser_bits.is_sop1_ptr <= s_axis_rc.tuser(141 downto 139);
        axis_rc_tuser_bits.is_sop2_ptr <= s_axis_rc.tuser(144 downto 142);
        axis_rc_tuser_bits.is_sop3_ptr <= s_axis_rc.tuser(147 downto 145);
        axis_rc_tuser_bits.is_sop4_ptr <= s_axis_rc.tuser(150 downto 148);
        axis_rc_tuser_bits.is_sop5_ptr <= s_axis_rc.tuser(153 downto 151);
        axis_rc_tuser_bits.is_sop6_ptr <= s_axis_rc.tuser(156 downto 154);
        axis_rc_tuser_bits.is_sop7_ptr <= s_axis_rc.tuser(159 downto 157);

        -- End-of-packet flags and their eight 5-bit pointers
        axis_rc_tuser_bits.is_eop      <= s_axis_rc.tuser(167 downto 160);
        axis_rc_tuser_bits.is_eop0_ptr <= s_axis_rc.tuser(172 downto 168);
        axis_rc_tuser_bits.is_eop1_ptr <= s_axis_rc.tuser(177 downto 173);
        axis_rc_tuser_bits.is_eop2_ptr <= s_axis_rc.tuser(182 downto 178);
        axis_rc_tuser_bits.is_eop3_ptr <= s_axis_rc.tuser(187 downto 183);
        axis_rc_tuser_bits.is_eop4_ptr <= s_axis_rc.tuser(192 downto 188);
        axis_rc_tuser_bits.is_eop5_ptr <= s_axis_rc.tuser(197 downto 193);
        axis_rc_tuser_bits.is_eop6_ptr <= s_axis_rc.tuser(202 downto 198);
        axis_rc_tuser_bits.is_eop7_ptr <= s_axis_rc.tuser(207 downto 203);

        -- Discontinue flag and parity
        axis_rc_tuser_bits.discontinue <= s_axis_rc.tuser(208);
        axis_rc_tuser_bits.parity      <= s_axis_rc.tuser(336 downto 209);
    end generate;

    -- Combinatorial ToHost FIFO read enable.
    --  * Outside IDLE the FIFO is read whenever add_header requests it (do_re_fifo)
    --    and the RQ interface is ready.
    --  * In IDLE the first word is read as soon as the active descriptor is enabled,
    --    not waiting, its FIFO is above the programmable-empty threshold and the RQ
    --    interface is ready.
    re_proc: process(rw_state, do_re_fifo, m_axis_r_rq, active_descriptor_s, dma_descriptors, dma_wait_s, toHostFifo_prog_empty_s)
    begin
        toHostFifo_re_s <= '0'; -- default: no read
        if rw_state /= IDLE then
            toHostFifo_re_s <= m_axis_r_rq.tready and do_re_fifo;
        elsif (toHostFifo_prog_empty_s(active_descriptor_s) = '0')
              and (m_axis_r_rq.tready = '1')
              and (dma_descriptors(active_descriptor_s).enable = '1')
              and (dma_wait_s(active_descriptor_s) = '0') then
            toHostFifo_re_s <= '1';
        end if;
    end process;

    -- Export the read enable to the FIFO.
    toHostFifo_re <= toHostFifo_re_s;

    -- Registered programmable-empty threshold for every ToHost FIFO.
    -- The threshold is the descriptor's DWORD count converted to FIFO words
    -- (8, 16 or 32 DWORDs per word for a 256, 512 or 1024 bit FIFO), zero-extended
    -- to 12 bits. While no read is in progress the count is first decremented by
    -- one; during a read the undecremented count is used so the threshold is one
    -- higher and the FIFO is not over-read.
    thresh: process(clk)
        variable dwords_v : std_logic_vector(10 downto 0); -- DWORD count used for the threshold
        variable words_v  : std_logic_vector(7 downto 0);  -- same count expressed in FIFO words
    begin
        if rising_edge(clk) then
            for desc in 0 to NUMBER_OF_DESCRIPTORS-2 loop
                dwords_v := dma_descriptors(desc).dword_count;
                if toHostFifo_re_s /= '1' then
                    dwords_v := dwords_v - 1; --32b words
                end if;

                case DATA_WIDTH is
                    when 256 =>
                        words_v := dwords_v(10 downto 3);         --256b fifo data width
                    when 512 =>
                        words_v := "0" & dwords_v(10 downto 4);   --512b fifo data width
                    when others =>
                        words_v := "00" & dwords_v(10 downto 5);  --1024b fifo data width
                end case;

                toHostFifo_empty_thresh(desc) <= x"0" & words_v;
            end loop;
        end if;
    end process;

    -- Extend the per-FIFO prog_empty flags to one bit per descriptor index
    -- (0 to NUMBER_OF_DESCRIPTORS). The two added upper bits, for indices
    -- NUMBER_OF_DESCRIPTORS-1 and NUMBER_OF_DESCRIPTORS, are constant '1', so those
    -- indices always read as "empty" wherever this vector is checked.
    toHostFifo_prog_empty_s <= "11"&toHostFifo_prog_empty;

    add_header: process(clk)
        variable next_active_descriptor_v: integer range 0 to (NUMBER_OF_DESCRIPTORS);
        variable start_transfer : std_logic := '0';
        variable advance_address, advance_address_p1 : std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
        variable wait_for_4k_boundary : std_logic_vector(1 downto 0);
        variable searchIndex : integer range 0 to NUMBER_OF_DESCRIPTORS;
        variable current_address_equals_pc_pointer : std_logic_vector(NUMBER_OF_DESCRIPTORS downto 0);
        variable FromHostIndex_v: integer range 0 to 1; --To select one of the 2 active FromHost descriptors

    begin
        if(rising_edge(clk)) then
            if(reset = '1') or (dma_soft_reset = '1') then
                do_re_fifo <= '0';
                rw_state <= IDLE;
                ToHostWriteCount_s <= x"00";
                evencycle_dma_s <= (others => '0');
                active_descriptor_s <= 0;
                wait_for_4k_boundary := (others => '0');
                toHostFifoIndex_s <= 0;
                StartSearchingAt <= 0;
                dma_wait_next_s <= (others => '0');
                address_wrapped_s <= (others => '0');
                --Only applies to 512b and frame straddle, tlast and tkeep are only used for 256b
                axis_rq_tuser_bits.is_sop <= "0000";
                axis_rq_tuser_bits.is_sop0_ptr <= "00";
                axis_rq_tuser_bits.is_sop1_ptr <= "00";
                axis_rq_tuser_bits.is_sop2_ptr <= "00";
                axis_rq_tuser_bits.is_sop3_ptr <= "00";
                axis_rq_tuser_bits.is_eop      <= "0000";
                axis_rq_tuser_bits.is_eop0_ptr <= "00000";
                axis_rq_tuser_bits.is_eop1_ptr <= "00000";
                axis_rq_tuser_bits.is_eop2_ptr <= "00000";
                axis_rq_tuser_bits.is_eop3_ptr <= "00000";
                next_address_equals_end_address <= (others => '0');
                s_m_axis_rq.tvalid <= '0';
                FromHostIndex_v := 0;
                nextFromHostIndex <= 0;
            else
                --If FromHost descriptor is disabled, clear wait_for_4k_boundary
                if (dma_descriptors(NUMBER_OF_DESCRIPTORS-1).enable='0') then
                    wait_for_4k_boundary(0) := '0';
                end if;
                if (dma_descriptors(NUMBER_OF_DESCRIPTORS).enable='0') then
                    wait_for_4k_boundary(1) := '0';
                end if;
                if active_descriptor_s = NUMBER_OF_DESCRIPTORS then
                    FromHostIndex_v := 1;
                elsif active_descriptor_s = NUMBER_OF_DESCRIPTORS-1 then
                    FromHostIndex_v := 0;
                end if;

                searchIndex := RoundRobinLookup(StartSearchingAt);
                --! Select either the first or the trickle FromHost descriptor.
                if searchIndex = NUMBER_OF_DESCRIPTORS-1 and nextFromHostIndex = 1 then
                    searchIndex := NUMBER_OF_DESCRIPTORS;
                end if;
                next_active_descriptor_v := active_descriptor_s;
                if((searchIndex /= active_descriptor_s) and (dma_descriptors(searchIndex).enable='1') and dma_wait_s(searchIndex) = '0') then
                    if(dma_descriptors(searchIndex).read_not_write = '0') then
                        if (toHostFifo_prog_empty_s(searchIndex) = '0') then
                            next_active_descriptor_v := searchIndex; --find another active descriptor, else just continue with the current descriptor. 0 has priority above 1 and so on.
                        end if;
                    end if;
                    if(((dma_descriptors(searchIndex).read_not_write = '1') and (fromHostFifo_prog_full = '0'))) then
                        next_active_descriptor_v :=searchIndex; --Regular fromhost descriptor
                    end if;
                end if;

                if(m_axis_r_rq.tready = '1') then
                    ToHostWriteCount_p1_s <= ToHostWriteCount_s;
                    s_m_axis_rq.tvalid <= '0';

                    s_m_axis_rq.tlast <= '0';
                    s_m_axis_rq.tkeep <= x"0000_0000";
                    for i in 0 to NUMBER_OF_DESCRIPTORS loop
                        dma_wait_s(i) <= not dma_descriptors(i).enable; --Start with dma_wait set if descriptor is not enabled
                    end loop;
                    dma_wait_pc_pointer_s <= (others => '0');


                    rq_data_valid <= toHostFifo_re_s;
                    rq_data_valid_p1 <= rq_data_valid;
                    rq_data <= toHostFifo_dout;
                    rq_data_p1 <= rq_data;

                    do_re_fifo <= '0';
                    advance_address := (others => '0');
                    -- Request state machine.
                    --   IDLE       : builds the 128 bit request header for the active descriptor and starts either a
                    --                memory write (read_not_write = '0') or a memory read (read_not_write = '1').
                    --   CONT_WRITE : keeps requesting ToHost FIFO data until the payload of the current write TLP is complete.
                    --   DELAY      : single extra cycle before returning to IDLE.
                    -- Whenever no transfer is started, or a transfer finishes, the state machine switches over to
                    -- next_active_descriptor_v.
                    case(rw_state) is
                        when IDLE =>
                            if toHostFifo_re_s = '0' and (DATA_WIDTH = 512 or DATA_WIDTH = 1024) then --If no data is being read, and we stay in idle (otherwise it is set later in this case) we cancel straddling.
                                ToHostWriteCount_s <= x"00";
                            end if;

                            -- Increment StartSearchingAt on every IDLE cycle, wrapping to 0 after (NUMBER_OF_DESCRIPTORS-1)*2-1.
                            if StartSearchingAt = (NUMBER_OF_DESCRIPTORS-1)*2-1 then
                                StartSearchingAt <= 0;
                            else
                                StartSearchingAt <= StartSearchingAt + 1;
                            end if;
                            add_delay_cycle <= '0';
                            start_transfer := '0';
                            if dma_descriptors(active_descriptor_s).enable = '1' then
                                if (dma_wait_s(active_descriptor_s) = '0') then
                                    -- Request header, most significant DWord first (bit numbers below are relative to each DWord).
                                    -----DW 3
                                    rq_header <=    '0' &      -- 31    reserved (1 bit)
                                                 req_attr & -- 30-28 Attr (3 bits)
                                                 req_tc &   -- 27-25 traffic class (3 bits)
                                                 '0' &      -- 24    req_id enable
                                                 x"0000" &  -- 23-8  completer ID, tied to 0 (bus number 23-16, device/function 15-8)
                                                 x"00" &    -- 7-0   client tag
                                    --DW 2
                                                 x"0000" &  -- 31-16 requester ID (16 bits), tied to 0
                                                 '0' &      -- 15    poisoned request
                                                 "000" & not dma_descriptors(active_descriptor_s).read_not_write &   -- 14-11 request type: "0000" memory read, "0001" memory write
                                                 dma_descriptors(active_descriptor_s).dword_count&  -- 10-0  DWord count
                                    --DW 1-0
                                                 current_address_s(active_descriptor_s)(63 downto 2)&"00"; -- 62 bit DWord address + 2 bit address type ("00", untranslated)
                                    rq_data <= toHostFifo_dout;
                                    if(((dma_descriptors(active_descriptor_s).read_not_write = '0') and (toHostFifo_prog_empty_s(active_descriptor_s) = '0'))) then --Start a write transaction
                                        rq_header_valid <= '1';
                                        advance_address(active_descriptor_s) := '1';
                                        rw_state <= CONT_WRITE;
                                        start_transfer := '1';
                                        -- For every bus width: if the payload is not exactly one bus word, keep reading the FIFO;
                                        -- otherwise request a DELAY cycle after CONT_WRITE (see add_delay_cycle there).
                                        if DATA_WIDTH = 256 then
                                            if dma_descriptors(active_descriptor_s).dword_count(10 downto 3) /= 1 then
                                                do_re_fifo <= '1';
                                            else
                                                add_delay_cycle <= '1';
                                            end if;
                                            ToHostWriteCount_s <= dma_descriptors(active_descriptor_s).dword_count(10 downto 3); --Number of 256 bit words in the TLP payload (dword_count / 8), excluding the header
                                        elsif DATA_WIDTH = 512 then
                                            if dma_descriptors(active_descriptor_s).dword_count(10 downto 4) /= 1 then
                                                do_re_fifo <= '1';
                                            else
                                                add_delay_cycle <= '1';
                                            end if;
                                            ToHostWriteCount_s <= ToHostWriteCount_s + dma_descriptors(active_descriptor_s).dword_count(10 downto 3) - 1; --256 bit words in the TLP added to the straddle remainder, plus 1 for header, but minus the first data beat that already gets validated (+1 -2 => -1)
                                        elsif DATA_WIDTH = 1024 then
                                            if dma_descriptors(active_descriptor_s).dword_count(10 downto 5) /= 1 then
                                                do_re_fifo <= '1';
                                            else
                                                add_delay_cycle <= '1';
                                            end if;
                                            ToHostWriteCount_s <= ToHostWriteCount_s + dma_descriptors(active_descriptor_s).dword_count(10 downto 3) - 3; --256 bit words in the TLP added to the straddle remainder, plus 1 for header, but minus the first data beat that already gets validated (+1 -4 => -3)
                                        else
                                            report "Unsupported DATA_WIDTH: " & integer'image(DATA_WIDTH) severity error;
                                        end if;
                                    end if;
                                    if(((dma_descriptors(active_descriptor_s).read_not_write = '1') and (fromHostFifo_prog_full = '0')) and (wait_for_4k_boundary(FromHostIndex_v) = '0')) then --Start a read transaction.
                                        rq_header_valid <= '1';
                                        advance_address(active_descriptor_s) := '1';
                                        rw_state <= DELAY; --A read request consists of the header only, so CONT_WRITE is skipped.
                                        start_transfer := '1';
                                        if DATA_WIDTH = 512 or DATA_WIDTH = 1024 then
                                            ToHostWriteCount_s <= ToHostWriteCount_s + 1; --If we still had 1 left, the header needs to straddle, otherwise we put it on position 0.
                                        end if;
                                    end if;
                                end if;
                            end if;
                            -- Nothing was started for the active descriptor in this cycle: move on to the next candidate.
                            if start_transfer = '0' then
                                if(next_active_descriptor_v < NUMBER_OF_DESCRIPTORS_TOHOST) then
                                    toHostFifoIndex_s <= next_active_descriptor_v; --Only ToHost descriptors select a ToHost FIFO.
                                end if;
                                active_descriptor_s <= next_active_descriptor_v;
                            end if;
                        when CONT_WRITE  =>
                            rw_state <= CONT_WRITE; --default
                            -- Keep reading the ToHost FIFO while the remaining count is above a threshold
                            -- that depends on the bus width and on the straddle position (low counter bits).
                            if (ToHostWriteCount_s > x"02" and DATA_WIDTH = 256) or

                     (ToHostWriteCount_s(7 downto 1) > x"01" and ToHostWriteCount_s(0) = '1' and DATA_WIDTH = 512) or
                     (ToHostWriteCount_s(7 downto 1) > x"02" and ToHostWriteCount_s(0) = '0' and DATA_WIDTH = 512) or

                     (ToHostWriteCount_s(7 downto 2) > x"01" and ToHostWriteCount_s(1 downto 0) = "11" and DATA_WIDTH = 1024) or
                     (ToHostWriteCount_s(7 downto 2) > x"01" and ToHostWriteCount_s(1 downto 0) = "10" and DATA_WIDTH = 1024) or
                     (ToHostWriteCount_s(7 downto 2) > x"01" and ToHostWriteCount_s(1 downto 0) = "01" and DATA_WIDTH = 1024) or
                     (ToHostWriteCount_s(7 downto 2) > x"02" and ToHostWriteCount_s(1 downto 0) = "00" and DATA_WIDTH = 1024)

                     then
                                do_re_fifo <= '1';
                            end if;
                            -- Remaining count at or below the per-width limit: leave CONT_WRITE and switch to the next descriptor.
                            if (ToHostWriteCount_s = x"01" and DATA_WIDTH = 256) or
                                   (ToHostWriteCount_s <= x"03" and (DATA_WIDTH = 512)) or
                                   (ToHostWriteCount_s <= x"07" and (DATA_WIDTH = 1024)) then
                                if add_delay_cycle = '0' then
                                    rw_state <= IDLE;
                                else
                                    rw_state <= DELAY; --Single bus word payload: add_delay_cycle was set in IDLE.
                                end if;
                                active_descriptor_s <= next_active_descriptor_v;
                                if(next_active_descriptor_v < NUMBER_OF_DESCRIPTORS_TOHOST) then
                                    toHostFifoIndex_s <= next_active_descriptor_v;
                                end if;
                            end if;
                        when DELAY =>
                            rw_state <= IDLE;
                            if (DATA_WIDTH=512 or DATA_WIDTH=1024) and ToHostWriteCount_s = x"02" then --If we did a straddled READ cycle, we end up having 2 in this counter, so we have to set it to 0 before going to IDLE.
                                ToHostWriteCount_s <= x"00";
                            end if;
                            active_descriptor_s <= next_active_descriptor_v;
                            dma_wait_s(active_descriptor_s) <= '1'; --Wait one more cycle with the current descriptor to allow circular DMA checks to finish in case of short TLPs (eg read).
                            if(next_active_descriptor_v < NUMBER_OF_DESCRIPTORS_TOHOST) then
                                toHostFifoIndex_s <= next_active_descriptor_v;
                            end if;
                        when others =>
                            rw_state <= IDLE; --Recover from any other state value.
                    end case;

                    axis_rq_tuser_bits.is_eop      <= "0000";
                    axis_rq_tuser_bits.is_eop0_ptr <= "00000";
                    axis_rq_tuser_bits.is_sop      <= "0000"; --No TLP starting in this beat
                    axis_rq_tuser_bits.is_sop0_ptr <= "00";
                    -- Assemble the next s_m_axis_rq beat.
                    --   rq_header_valid = '1' : the beat carries a new request header. For 512b / 1024b the low bits of
                    --                           ToHostWriteCount_s select where the header is placed, so that the tail of
                    --                           the previous TLP (rq_data_p1) can share the beat (straddling).
                    --   otherwise, with rq_data_valid or rq_data_valid_p1 set: the beat carries payload only, merged
                    --                           from rq_data and rq_data_p1.
                    -- is_eopN_ptr / is_sop0_ptr values are documented per branch; EOP pointers count DWORDs within the beat.
                    if rq_header_valid = '1' then
                        s_m_axis_rq.tvalid <= '1';
                        if DATA_WIDTH = 256 then
                            --Header in the lower 128 bits, first 128 bits of payload above it.
                            s_m_axis_rq.tdata(127 downto 0) <= rq_header;
                            s_m_axis_rq.tdata(DATA_WIDTH-1 downto 128) <= rq_data(DATA_WIDTH-129 downto 0);

                            if rq_data_valid = '1' then
                                if ToHostWriteCount_s /= 0 then
                                    ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --decrement by DATA_WIDTH/256, i.e. 1 for the 256b interface
                                end if;
                                s_m_axis_rq.tkeep <= x"000000FF"; --write transaction with data.
                            else
                                s_m_axis_rq.tkeep <= x"0000000F"; --read transaction, only header
                                s_m_axis_rq.tlast <= '1';
                            end if;
                        elsif DATA_WIDTH=512 then --512b interface
                            axis_rq_tuser_bits.is_sop <= "0001"; --Single TLP starting in this beat
                            if ToHostWriteCount_s(0) = '0' then  --Even number means we were frame straddling
                                --Layout: [511:384] first 128 bits of new payload, [383:256] new header,
                                --        [255:128] padding, [127:0] last 128 bits of the previous TLP.
                                s_m_axis_rq.tdata(DATA_WIDTH-1 downto 384) <= rq_data(DATA_WIDTH-385 downto 0); -- @suppress "Index out of range: 384 is not in (255 downto 0)"
                                s_m_axis_rq.tdata(DATA_WIDTH-129 downto 256) <= rq_header(DATA_WIDTH-385 downto 0); -- @suppress "Index out of range: 256 is not in (255 downto 0)"
                                s_m_axis_rq.tdata(255 downto 128) <= (others => '0');
                                s_m_axis_rq.tdata(127 downto 0) <= rq_data_p1(DATA_WIDTH-1 downto DATA_WIDTH-128);
                                axis_rq_tuser_bits.is_sop0_ptr <= "10"; --New TLP starts in the upper 256 bits

                                if rq_data_valid = '1' then
                                    axis_rq_tuser_bits.is_eop      <= "0001"; --Previous TLP ends in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "00011"; --Previous TLP ends at DWORD 3
                                    axis_rq_tuser_bits.is_eop1_ptr <= "00000"; --No second EOP
                                    if ToHostWriteCount_s /= 0 then
                                        ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --count with steps of 2 (2x256b) for 512b interface
                                    end if;
                                else --read transaction, end TLP immediately
                                    axis_rq_tuser_bits.is_eop      <= "0011"; --Two TLPs ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --Previous TLP ends at DWORD 3
                                    axis_rq_tuser_bits.is_eop1_ptr <= "0"&x"B"; --Header-only read TLP ends at DWORD 11 => 0xB
                                end if;

                            else
                                --Not straddling: header in the lower 128 bits, payload above it.
                                s_m_axis_rq.tdata(127 downto 0) <= rq_header;
                                s_m_axis_rq.tdata(DATA_WIDTH-1 downto 128) <= rq_data(DATA_WIDTH-129 downto 0);
                                axis_rq_tuser_bits.is_sop0_ptr <= "00";
                                if rq_data_valid = '1' then
                                    axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "00000";
                                    if ToHostWriteCount_s /= 1 then
                                        ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --count with steps of 2 (2x256b) for 512b interface
                                    end if;
                                else --read transaction, end TLP immediately
                                    axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --DWORD 3 is the last DWORD of the header
                                    ToHostWriteCount_s <= x"00";
                                end if;
                            end if;
                        else  --1024 interface
                            axis_rq_tuser_bits.is_sop <= "0001"; --Single TLP starting in this beat
                            case ToHostWriteCount_s(1 downto 0) is
                                when "00" =>--Frame straddling, new TLP starts in 256 bit segment 3 out of 0 .. 3
                                    --First 128 bits of new data following header
                                    s_m_axis_rq.tdata(1023 downto 896) <= rq_data(127 downto 0);
                                    --New TLP header bits
                                    s_m_axis_rq.tdata(895 downto 768) <= rq_header;
                                    --128b of padding
                                    s_m_axis_rq.tdata(767 downto 640) <= (others => '0');
                                    --Last 640 bits of previous TLP
                                    s_m_axis_rq.tdata(639 downto 0) <= rq_data_p1(1023 downto 384);
                                    axis_rq_tuser_bits.is_sop0_ptr <= "11"; --Byte lane 96

                                    if rq_data_valid = '1' then
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Previous TLP ends in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "1"&x"3"; --DWORD 19 => 0x13
                                        axis_rq_tuser_bits.is_eop1_ptr <= "0"&x"0"; --No second EOP
                                        if ToHostWriteCount_s /= 0 then
                                            ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --count with steps of 4 (4x256b) for 1024b interface
                                        end if;
                                    else --read transaction, end TLP immediately
                                        axis_rq_tuser_bits.is_eop      <= "0011"; --Two TLPs ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "1"&x"3"; --DWORD 19 => 0x13
                                        axis_rq_tuser_bits.is_eop1_ptr <= "1"&x"B"; --DWORD 27 => 0x1B, only header is transmitted here.
                                        ToHostWriteCount_s <= (others => '0'); --No more straddling after read request, start a new transaction at 0
                                    end if;
                                when "11" =>--Frame straddling, new TLP starts in 256 bit segment 2 out of 0 .. 3
                                    --First 384 bits of payload from new TLP
                                    s_m_axis_rq.tdata(1023 downto 640) <= rq_data(383 downto 0);
                                    --New TLP header bits
                                    s_m_axis_rq.tdata(639 downto 512) <= rq_header;
                                    --128b of padding
                                    s_m_axis_rq.tdata(511 downto 384) <= (others => '0');
                                    --Last 384 bits of previous TLP
                                    s_m_axis_rq.tdata(383 downto 0) <= rq_data_p1(1023 downto 640);
                                    axis_rq_tuser_bits.is_sop0_ptr <= "10"; --Byte lane 64

                                    if rq_data_valid = '1' then
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Previous TLP ends in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"B"; --Ending at DWORD 11 => 0xB
                                        axis_rq_tuser_bits.is_eop1_ptr <= "0"&x"0"; --No second EOP
                                        if ToHostWriteCount_s /= 3 then
                                            ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --count with steps of 4 (4x256b) for 1024b interface
                                        end if;
                                    else --read transaction, end TLP immediately
                                        axis_rq_tuser_bits.is_eop      <= "0011"; --Two TLPs ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"B"; --Ending at DWORD 11 => 0xB
                                        axis_rq_tuser_bits.is_eop1_ptr <= "1"&x"3"; --DWORD 19 => 0x13, only header is transmitted here.
                                        ToHostWriteCount_s <= (others => '0'); --No more straddling after read request, start a new transaction at 0
                                    end if;
                                when "10" =>--Frame straddling, new TLP starts in 256 bit segment 1 out of 0 .. 3
                                    -- First 640 bits of new TLP
                                    s_m_axis_rq.tdata(1023 downto 384) <= rq_data(639 downto 0);
                                    -- New TLP header
                                    s_m_axis_rq.tdata(383 downto 256) <= rq_header;
                                    --128b of padding
                                    s_m_axis_rq.tdata(255 downto 128) <= (others => '0');
                                    --last 128 bits of previous TLP
                                    s_m_axis_rq.tdata(127 downto 0) <= rq_data_p1(1023 downto 896);
                                    axis_rq_tuser_bits.is_sop0_ptr <= "01"; --Byte lane 32

                                    if rq_data_valid = '1' then
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Previous TLP ends in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --First TLP ending at DWORD 3 => 0x3
                                        axis_rq_tuser_bits.is_eop1_ptr <= "0"&x"0"; --No second EOP
                                        if ToHostWriteCount_s /= 2 then
                                            ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --count with steps of 4 (4x256b) for 1024b interface
                                        end if;
                                    else --read transaction, end TLP immediately
                                        axis_rq_tuser_bits.is_eop      <= "0011"; --Two TLPs ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --First TLP ending at DWORD 3 => 0x3
                                        axis_rq_tuser_bits.is_eop1_ptr <= "0"&x"B"; --Second TLP ending at DWORD 11 => 0xB
                                        ToHostWriteCount_s <= (others => '0'); --No more straddling after read request, start a new transaction at 0
                                    end if;
                                when "01" => --Not frame straddling
                                    s_m_axis_rq.tdata(1023 downto 128) <= rq_data(895 downto 0);
                                    s_m_axis_rq.tdata(127 downto 0) <= rq_header;
                                    axis_rq_tuser_bits.is_sop0_ptr <= "00"; --TLP starting at byte lane 0
                                    if rq_data_valid = '1' then
                                        axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending here
                                        axis_rq_tuser_bits.is_eop0_ptr <= "00000";
                                        if ToHostWriteCount_s /= 1 then
                                            ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --count with steps of 4 (4x256b) for 1024b interface
                                        end if;
                                    else --read transaction, end TLP immediately
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --DWORD 3 is the last DWORD of the header
                                        ToHostWriteCount_s <= (others => '0'); --No more straddling after read request, start a new transaction at 0
                                    end if;
                                when others =>
                                    NULL;
                            end case;
                        end if;
                        rq_header_valid <= '0'; --The header is consumed by this beat.
                    elsif rq_data_valid = '1' or rq_data_valid_p1 = '1' then
                        --Payload-only beat.
                        s_m_axis_rq.tvalid <= '1';
                        --Decrement the remaining count by DATA_WIDTH/256 as long as more than the straddle remainder is left.
                        if (ToHostWriteCount_s /= x"00" and DATA_WIDTH=256) or
                           (ToHostWriteCount_s /= x"00" and ToHostWriteCount_s /= x"01" and DATA_WIDTH=512) or
                           (ToHostWriteCount_s > x"03" and DATA_WIDTH=1024) then
                            ToHostWriteCount_s <= ToHostWriteCount_s - (1 * (DATA_WIDTH/256)); --steps of 1 (256b), 2 (512b) or 4 (1024b)
                        end if;

                        --The two assignments below come later in the process and therefore override the decrement above.
                        if (rw_state /= IDLE and ToHostWriteCount_p1_s = x"01" and DATA_WIDTH = 512) then --Last cycle, no new header coming so we don't have to straddle the next TLP.
                            ToHostWriteCount_s <= x"00";
                        end if;
                        if (rw_state /= IDLE and ToHostWriteCount_p1_s < x"04" and DATA_WIDTH = 1024) then --Last cycle, no new header coming so we don't have to straddle the next TLP.
                            ToHostWriteCount_s <= x"00";
                        end if;

                        if DATA_WIDTH = 256 then
                            s_m_axis_rq.tdata(DATA_WIDTH-1 downto 0)  <= rq_data(DATA_WIDTH-129 downto 0) & --128 bits data
                                                                         rq_data_p1(DATA_WIDTH-1 downto DATA_WIDTH-128); --128 bits data from last clock cycle
                            if ToHostWriteCount_s /= 0 then
                                s_m_axis_rq.tlast <= '0';
                                s_m_axis_rq.tkeep  <= x"0000_00FF";
                            else
                                --Last beat of the TLP: only the lower 4 DWORDs (the 128 bits from rq_data_p1) are kept.
                                s_m_axis_rq.tlast <= '1';
                                s_m_axis_rq.tkeep <= (others => '0'); --for 16 bit tkeep; NOTE(review): overridden by the assignment on the next line
                                s_m_axis_rq.tkeep <= x"0000_000F";

                            end if;
                        elsif DATA_WIDTH = 512 then --Data width 512
                            s_m_axis_rq.tlast <= '0';
                            s_m_axis_rq.tkeep <= x"0000_FFFF";
                            if ToHostWriteCount_p1_s(0) = '0' then --Even number means we were frame straddling
                                s_m_axis_rq.tdata(DATA_WIDTH-1 downto 0)  <= rq_data(DATA_WIDTH-385 downto 0) & --128 bits data
                                                                             rq_data_p1(DATA_WIDTH-1 downto DATA_WIDTH-384); --384 bits data from last clock cycle
                                if ToHostWriteCount_p1_s /= 2 then -- Not the last beat of the TLP
                                    axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"0"; --unused
                                else
                                    axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"B"; --DWORD 11 (0xB) contains last DWORD
                                end if;
                            else
                                s_m_axis_rq.tdata(DATA_WIDTH-1 downto 0)  <= rq_data(DATA_WIDTH-129 downto 0) & --384 bits data
                                                                             rq_data_p1(DATA_WIDTH-1 downto DATA_WIDTH-128); --128 bits data from last clock cycle
                                if ToHostWriteCount_p1_s /= 1 then -- Not the last beat of the TLP
                                    axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"0"; --unused
                                else
                                    axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                    axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --DWORD 3 contains last DWORD
                                end if;
                            end if;
                        else --DATA_WIDTH = 1024
                            s_m_axis_rq.tlast <= '0';
                            s_m_axis_rq.tkeep <= x"FFFF_FFFF";
                            --The low two bits of the previous-cycle counter select how rq_data and rq_data_p1 are merged.
                            case ToHostWriteCount_p1_s(1 downto 0) is
                                when "01" =>
                                    s_m_axis_rq.tdata(1023 downto 0)  <= rq_data(895 downto 0) & --896 bits data
                                                                         rq_data_p1(1023 downto 896); --128 bits data from last clock cycle
                                    -- NOTE(review): in this branch bits (1 downto 0) are "01", so the value can never be 3 and the
                                    -- EOP (else) branch looks unreachable - confirm the intended constant.
                                    if ToHostWriteCount_p1_s /= 3 then
                                        axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"0"; --
                                    else
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"3"; --DWORD 3 (0x03): end of the 128 bits carried over from rq_data_p1
                                    end if;
                                when "10" => --

                                    s_m_axis_rq.tdata(1023 downto 0)  <= rq_data(639 downto 0) & --640 bits data
                                                                         rq_data_p1(1023 downto 640); --384 bits data from last clock cycle
                                    if ToHostWriteCount_p1_s /= 2 then -- Not the last beat of the TLP
                                        axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"0"; --
                                    else
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"B"; --DWORD 11 (0x0B): end of the 384 bits carried over from rq_data_p1
                                    end if;
                                when "11" => --

                                    s_m_axis_rq.tdata(1023 downto 0)  <= rq_data(383 downto 0) & --384 bits data
                                                                         rq_data_p1(1023 downto 384); --640 bits data from last clock cycle
                                    -- NOTE(review): in this branch bits (1 downto 0) are "11", so the value can never be 1 and the
                                    -- EOP (else) branch looks unreachable - confirm the intended constant.
                                    if ToHostWriteCount_p1_s /= 1 then
                                        axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"0"; --
                                    else
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "1"&x"3"; --DWORD 19 (0x13): end of the 640 bits carried over from rq_data_p1
                                    end if;
                                when "00" => --

                                    s_m_axis_rq.tdata(1023 downto 0)  <= rq_data(127 downto 0) & --128 bits data
                                                                         rq_data_p1(1023 downto 128); --896 bits data from last clock cycle
                                    if ToHostWriteCount_p1_s /= 0 then -- Not the last beat of the TLP
                                        axis_rq_tuser_bits.is_eop      <= "0000"; --No TLP ending
                                        axis_rq_tuser_bits.is_eop0_ptr <= "0"&x"0"; --
                                    else
                                        axis_rq_tuser_bits.is_eop      <= "0001"; --Single TLP ending in this beat
                                        axis_rq_tuser_bits.is_eop0_ptr <= "1"&x"B"; --DWORD 27 (0x1B): end of the 896 bits carried over from rq_data_p1
                                    end if;
                                when others => NULL;
                            end case;

                        end if;
                    end if;

                    -- Per-descriptor address bookkeeping. advance_address_p1(i) marks a descriptor that had a request
                    -- started on the previous pass through this process. Index NUMBER_OF_DESCRIPTORS is part of the
                    -- loop range but is excluded from the pc_pointer comparisons below.
                    -- Several signals (dma_wait_s, next_address_s, next_address_equals_end_address) are assigned in more than
                    -- one place; the textually last assignment that executes wins, so the statement order matters.
                    for i in 0 to NUMBER_OF_DESCRIPTORS loop
                        address_wrapped_s(i) <= '0'; --default, set to '1' further down when the descriptor wraps
                        if advance_address_p1(i) = '1'  then
                            -- Step the address pair by one TLP: dword_count & "00" is the transfer size in bytes.
                            if next_address_equals_end_address(i) = '1' then
                                --End of the buffer reached: restart from start_address.
                                current_address_s(i) <= dma_descriptors(i).start_address;
                                next_address_s(i) <= dma_descriptors(i).start_address+ (dma_descriptors(i).dword_count&"00");
                            else
                                current_address_s(i) <= next_address_s(i);
                                next_address_s(i) <= (next_address_s(i) + (dma_descriptors(i).dword_count&"00"));
                            end if;


                            --current_address_s(i) is about to take the value of next_address_s(i), so compare that one.
                            if(next_address_s(i)=dma_descriptors(i).pc_pointer) and dma_descriptors(i).enable = '1' and i /= NUMBER_OF_DESCRIPTORS then
                                current_address_equals_pc_pointer(i) := '1';
                            else
                                current_address_equals_pc_pointer(i) := '0';
                            end if;
                        else --if dma_descriptors(i).pc_pointer_updated = '1' then
                            --Address not advancing: compare the current address with the PC pointer.
                            if(current_address_s(i)=dma_descriptors(i).pc_pointer) and dma_descriptors(i).enable = '1' and i /= NUMBER_OF_DESCRIPTORS then
                                current_address_equals_pc_pointer(i) := '1';
                            else
                                current_address_equals_pc_pointer(i) := '0';
                            end if;
                        end if;
                        if (dma_descriptors(i).enable = '0' ) then
                            evencycle_dma_s(i) <= '0'; --A disabled descriptor restarts with evencycle cleared.
                        end if;
                        -- Wrap-around descriptor for which (evencycle_dma_s xor read_not_write) differs from evencycle_pc:
                        -- stall when the address has reached the PC pointer.
                        if(dma_descriptors(i).wrap_around = '1' and dma_descriptors(i).enable = '1' and (evencycle_dma_s(i) xor dma_descriptors(i).read_not_write) /= dma_descriptors(i).evencycle_pc) then
                            if(current_address_equals_pc_pointer(i) = '1' and i /= NUMBER_OF_DESCRIPTORS) then --The last FromHost descriptor always goes around, never waits for a PC pointer.
                                dma_wait_s(i) <= '1'; --the PC is not ready to accept data, so we have to wait. dma_wait will clear the enable flag of the descriptors towards dma_read_write
                                dma_wait_pc_pointer_s(i) <= '1'; --same as dma_wait, but only for the pc_pointer case. Helper signal to see if we are allowed to wrap around
                            end if;
                        end if;
                        -- Detect that the next address has reached the end of the buffer.
                        if advance_address_p1(i) = '1' or dma_descriptors(i).wrap_around = '1' then
                            if next_address_s(i) = dma_descriptors(i).end_address and dma_descriptors(i).enable = '1' then
                                next_address_equals_end_address(i) <= '1';
                                dma_wait_s(i) <= '1';--dma_descriptors(i).wrap_around; --give dma_control one extra clock cycle to disable it.
                            else
                                next_address_equals_end_address(i) <= '0';
                            end if;
                        end if;
                        -- End address was flagged and we are not waiting for the PC pointer: perform the wrap.
                        if next_address_equals_end_address(i) = '1' and dma_wait_pc_pointer_s(i) = '0' then
                            if advance_address_p1(i) = '0' then --if 1, this is handled above.
                                next_address_s(i) <= dma_descriptors(i).start_address;
                            end if;
                            next_address_equals_end_address(i) <= '0';
                            evencycle_dma_s(i) <= (not evencycle_dma_s(i)) and dma_descriptors(i).wrap_around; --toggle on wrap, forced to '0' without wrap_around
                            address_wrapped_s(i) <= '1'; --tell dma_control that we wrapped around
                            dma_wait_s(i) <= '1';--dma_descriptors(i).wrap_around; --give dma_control one extra clock cycle to disable it.
                            dma_wait_next_s(i) <= dma_descriptors(i).wrap_around; --extends the wait by one more cycle for wrap-around descriptors
                        end if;
                        -- A read (FromHost) request was issued and the next address is 4 kB aligned: flag the boundary.
                        if advance_address_p1(i) = '1' and  dma_descriptors(i).read_not_write = '1' and dma_descriptors(i).enable = '1' then
                            if next_address_s(i)(11 downto 0) = x"000" then
                                wait_for_4k_boundary(FromHostIndex_v) := '1'; -- NOTE(review): indexed by FromHostIndex_v (set outside this excerpt), not by the loop index i - confirm
                            end if;
                        end if;
                    end loop;
                    advance_address_p1 := advance_address;
                    if dma_wait_next_s /= (dma_wait_next_s'range => '0') then
                        dma_wait_s <= dma_wait_s or dma_wait_next_s; --This will set dma_wait_s high for two clocks using this mechanism.
                        dma_wait_next_s <= (others => '0');
                    end if;

                    --! Switch between regular fromhost and trickle fromhost descriptor
                    if advance_address_p1(active_descriptor_s) = '1' and  dma_descriptors(active_descriptor_s).read_not_write = '1' and dma_descriptors(active_descriptor_s).enable = '1' then
                        if next_address_s(active_descriptor_s)(11 downto 0) = x"000" or dma_wait_pc_pointer_s(active_descriptor_s) = '1' then
                            if FromHostIndex_v = 0 and dma_descriptors(NUMBER_OF_DESCRIPTORS).enable = '1' then
                                nextFromHostIndex <= 1;
                            end if;
                            if FromHostIndex_v = 1 and dma_descriptors(NUMBER_OF_DESCRIPTORS-1).enable = '1' then
                                nextFromHostIndex <= 0;
                            end if;
                        end if;
                    end if;
                end if; --tready

                --! if only one FromHost descriptor is enabled, always select the enabled one.
                if dma_descriptors(NUMBER_OF_DESCRIPTORS-1).enable = '0' and dma_descriptors(NUMBER_OF_DESCRIPTORS).enable = '1' then
                    nextFromHostIndex <= 1;
                end if;
                if dma_descriptors(NUMBER_OF_DESCRIPTORS-1).enable = '1' and dma_descriptors(NUMBER_OF_DESCRIPTORS).enable = '0' then
                    nextFromHostIndex <= 0;
                end if;



                for i in 0 to NUMBER_OF_DESCRIPTORS loop
                    if dma_descriptors(i).enable = '0' then
                        current_address_s(i) <= dma_descriptors(i).start_address;
                        next_address_s(i) <= (dma_descriptors(i).start_address + (dma_descriptors(i).dword_count&"00"));
                        current_address_equals_pc_pointer(i) := '1'; --In case the pc_pointer is still at start_address,
                    --we may be too late for the comparison,
                    --so initialize it to 1 before enabling,
                    --See Carlos comment from 8-3-2021 on FLX-1442
                    end if;
                end loop;
                if clear_wait_for_4k_boundary = '1' then
                    wait_for_4k_boundary(FromHostIndex_v) := '0';
                end if;
            end if; --clk
        end if; --reset
    end process;

    -- Export the per-descriptor bookkeeping signals of this architecture
    -- through the dma_status record array, one element per descriptor
    -- (indices 0 to NUMBER_OF_DESCRIPTORS inclusive).
    g0: for desc in 0 to NUMBER_OF_DESCRIPTORS generate
        dma_status(desc).current_address <= current_address_s(desc);
        dma_status(desc).address_wrapped <= address_wrapped_s(desc);
        dma_status(desc).evencycle_dma   <= evencycle_dma_s(desc);
    end generate g0;


    -- Back-pressure towards the requester-completion (RC) stream: a beat is
    -- accepted whenever the FromHost FIFO is not full. The FIFO's full flag is
    -- used here, not its programmable-full flag.
    rc_tready_s <= not fromHostFifo_full;
    -- The internal copy rc_tready_s is what strip_hdr samples; the same value
    -- is driven out on the AXI-stream ready port.
    s_axis_r_rc.tready <= rc_tready_s;

    -- strip_hdr: removes the 96-bit header from the beats arriving on s_axis_rc,
    -- re-aligns the remaining payload to DATA_WIDTH-bit words, stores those words
    -- in rc_interface_mem and finally copies the memory, in address order, into
    -- the FromHost FIFO.
    --
    -- Write side:
    --   * A beat is taken when s_axis_rc.tvalid and rc_tready_s are both '1'.
    --   * On is_sop(0) the header is read from the 128-bit lane selected by
    --     is_sop0_ptr; header bits (11 downto log2(DATA_WIDTH)-3) become the
    --     memory write address of the first payload word.
    --     NOTE(review): presumably this is the lower-address field of the
    --     completion descriptor, i.e. the word offset inside a 4 kB block;
    --     confirm against the PCIe core documentation.
    --   * The payload above the header is parked in mem_dina_pipe and is joined
    --     with the low part of the following beat to form one memory word.
    --   * mem_full holds one flag per memory word; a flag is set in the cycle
    --     after the word was written (mem_wea = "1").
    -- Read side:
    --   * Reading starts when word 0 is flagged full and walks upwards while the
    --     addressed word is flagged; every word read is written to the FIFO and
    --     its flag is cleared.
    --   * After the last address (or when word 0 is written again while the
    --     reader is stalled) the reader returns to address 0 and pulses
    --     clear_wait_for_4k_boundary.
    strip_hdr: process(clk)
        --variable receive_word_count_v: std_logic_vector(10 downto 0);
        variable rc_header: std_logic_vector(95 downto 0);
        -- 128-bit lane in which the most recent header started.
        variable RcStraddlePosition: integer range 0 to DATA_WIDTH/128-1;
    begin
        if(rising_edge(clk)) then
            if(reset = '1') or (dma_soft_reset = '1') then
                --strip_state <= IDLE;
                mem_wea <= "0";
                mem_full <= (others => '0');
                -- The delayed copy of the flags must be cleared as well, otherwise
                -- a stale bit starts a FIFO write right after the reset is released.
                mem_full_p1 <= (others => '0');
                mem_addra <= (others => '0');
                mem_addra_next <= (others => '0');
                mem_addra_next_valid <= '0';
                -- Discard any half-assembled word left in the pipeline register.
                mem_dina_pipe_valid <= '0';
                mem_addrb <= (others => '0');
                -- Return the read-out side to idle; without this a reset during
                -- read-out leaves the reader active with mem_addrb at 0.
                reading_mem <= '0';
                clear_wait_for_4k_boundary <= '0';
                fromHostFifo_we <= '0';
                RcStraddlePosition := 0;
            else
                --defaults:
                mem_wea <= "0";
                mem_dina_pipe_valid <= '0';
                mem_addra_next_valid <= '0';


                --! Combine the parked payload with the current beat and write it to memory.
                if mem_dina_pipe_valid = '1' and s_axis_rc.tvalid = '1' and rc_tready_s = '1' then
                    mem_wea <= "1";
                    -- A new packet starting in lane 0 carries no data belonging to
                    -- the previous beat, so nothing is written in that case.
                    if axis_rc_tuser_bits.is_sop(0) = '1' and axis_rc_tuser_bits.is_sop0_ptr = "00" then
                        mem_wea <= "0";
                    end if;

                    -- The first word of a packet goes to the address taken from the
                    -- header, all following words to consecutive addresses.
                    if mem_addra_next_valid = '1' then
                        mem_addra <= mem_addra_next;
                    else
                        mem_addra <= mem_addra+1;
                    end if;
                    -- RcStraddlePosition still holds the lane of the previous header
                    -- here; it is only updated further down. Only the matching lane
                    -- assigns mem_dina, all slices are in range for any DATA_WIDTH
                    -- that is a multiple of 128.
                    for lane in 0 to DATA_WIDTH/128-1 loop
                        if RcStraddlePosition = lane then
                            mem_dina <= s_axis_rc.tdata(95+128*lane downto 0) & mem_dina_pipe(DATA_WIDTH-(97+128*lane) downto 0);
                        end if;
                    end loop;
                end if;

                --! Capture the header on start of packet and park the payload that follows it.
                if(s_axis_rc.tvalid = '1' and rc_tready_s = '1') then
                    if axis_rc_tuser_bits.is_sop(0) = '1' then
                        RcStraddlePosition := to_integer(unsigned(axis_rc_tuser_bits.is_sop0_ptr));
                        -- The header occupies the low 96 bits of the lane it starts in.
                        for lane in 0 to DATA_WIDTH/128-1 loop
                            if RcStraddlePosition = lane then
                                rc_header := s_axis_rc.tdata(95+128*lane downto 128*lane);
                            end if;
                        end loop;
                        --receive_word_count <= rc_header(42 downto 32);
                        mem_addra_next <= rc_header(11 downto f_log2(DATA_WIDTH)-3);
                        mem_addra_next_valid <= '1';
                    end if;
                    -- Park everything above the header position:
                    -- DATA_WIDTH-96-128*lane bits (160 bits for DATA_WIDTH = 256, lane 0).
                    for lane in 0 to DATA_WIDTH/128-1 loop
                        if RcStraddlePosition = lane then
                            mem_dina_pipe(DATA_WIDTH-(97+128*lane) downto 0) <= s_axis_rc.tdata(DATA_WIDTH-1 downto 96+128*lane);
                        end if;
                    end loop;
                    mem_dina_pipe_valid <= '1';
                end if;

                -- mem_wea is a registered signal, so this flags the word whose write
                -- was issued in the previous clock cycle.
                if mem_wea = "1" then
                    mem_full(to_integer(unsigned(mem_addra))) <= '1';
                end if;
                --! Read out memory and write into fromHostFifo
                clear_wait_for_4k_boundary <= '0';
                fromHostFifo_we <= '0';
                mem_full_p1 <= mem_full;
                if reading_mem = '0' then --We are not in the process of dumping the complete memory into the fifo, start at 0
                    if(mem_full_p1(0) = '1') then
                        mem_addrb <= mem_addrb + 1;
                        reading_mem <= '1';
                        fromHostFifo_we <= '1'; --we will write the fifo in the next cycle, when the ram is read out.
                        mem_full(0) <= '0';
                    end if;
                else
                    if(mem_full_p1(to_integer(unsigned(mem_addrb))) = '1') then
                        if(mem_addrb /= (mem_addrb'range => '1')) then
                            mem_addrb <= mem_addrb + 1;
                        else
                            -- Last word of the memory: go back to idle and release
                            -- the 4k-boundary wait.
                            reading_mem <= '0';
                            mem_addrb <= (others => '0');
                            clear_wait_for_4k_boundary <= '1';
                        end if;
                        fromHostFifo_we <= '1'; --we will write the fifo in the next cycle, when the ram is read out.
                        mem_full(to_integer(unsigned(mem_addrb))) <= '0';
                    else
                        if mem_full(0) = '1' then --if address 0 is written, go back immediately
                            reading_mem <= '0';
                            mem_addrb <= (others => '0');
                            clear_wait_for_4k_boundary <= '1';
                        end if;
                    end if;
                end if;


            end if; --reset
        end if; --clk
    end process;


    -- The data input of the FromHost FIFO is driven directly by the read port
    -- (doutb) of rc_interface_mem.
    fromHostFifo_din <= mem_doutb;

    -- Re-ordering buffer between the RC stream and the FromHost FIFO:
    -- a simple dual-port RAM with both ports on clk. Port A is written by
    -- strip_hdr (mem_wea / mem_addra / mem_dina), port B is read through
    -- mem_addrb and delivers mem_doutb.
    rc_interface_mem : xpm_memory_sdpram
        generic map ( -- @suppress "Generic map uses default values. Missing optional actuals: USE_EMBEDDED_CONSTRAINT, CASCADE_HEIGHT, SIM_ASSERT_CHK, RST_MODE_A, RST_MODE_B" -- @suppress "Generic map uses default values. Missing optional actuals: USE_MEM_INIT_MMI, USE_EMBEDDED_CONSTRAINT, CASCADE_HEIGHT, SIM_ASSERT_CHK, WRITE_PROTECT, RST_MODE_A, RST_MODE_B"
            -- Geometry: 2**(15-log2(DATA_WIDTH)) words of DATA_WIDTH bits,
            -- i.e. 32768 bits in total (assuming f_log2 is the base-2 logarithm).
            MEMORY_SIZE => 32768,
            ADDR_WIDTH_A => 15-f_log2(DATA_WIDTH),
            ADDR_WIDTH_B => 15-f_log2(DATA_WIDTH),
            WRITE_DATA_WIDTH_A => DATA_WIDTH,
            BYTE_WRITE_WIDTH_A => DATA_WIDTH, -- same as the word width: one write enable per word
            READ_DATA_WIDTH_B => DATA_WIDTH,
            -- Implementation
            MEMORY_PRIMITIVE => "auto",
            MEMORY_OPTIMIZATION => "true",
            CLOCKING_MODE => "common_clock",
            ECC_MODE => "no_ecc",
            -- Read port behaviour
            READ_LATENCY_B => 1,
            READ_RESET_VALUE_B => "0",
            WRITE_MODE_B => "no_change",
            -- Initial contents
            USE_MEM_INIT => 1,
            MEMORY_INIT_FILE => "none",
            MEMORY_INIT_PARAM => "0",
            -- Power saving is not used
            AUTO_SLEEP_TIME => 0,
            WAKEUP_TIME => "disable_sleep",
            -- Simulation messages
            MESSAGE_CONTROL => 0
        )
        port map (
            sleep => '0',
            -- Port A: write side, always enabled
            clka => clk,
            ena => '1',
            wea => mem_wea,
            addra => mem_addra,
            dina => mem_dina,
            -- Error injection is tied off (ECC_MODE is "no_ecc")
            injectsbiterra => '0',
            injectdbiterra => '0',
            -- Port B: read side, always enabled
            clkb => clk,
            rstb => reset,
            enb => '1',
            regceb => '1',
            addrb => mem_addrb,
            doutb => mem_doutb,
            -- ECC status outputs are left unconnected
            sbiterrb => open,
            dbiterrb => open
        );
end architecture rtl ; -- of dma_read_write
